Automatic Cell Annotatiton: decoupler - Ensemble of methods to infer biological activities
Citation: Badia-i-Mompel P., Vélez Santiago J., Braunger J., Geiss C., Dimitrov D., Müller-Dott S., Taus P., Dugourd A., Holland C.H., Ramirez Flores R.O. and Saez-Rodriguez J. 2022. decoupleR: ensemble of computational methods to infer biological activities from omics data. Bioinformatics Advances. https://doi.org/10.1093/bioadv/vbac016
import scanpy as sc #software suite of tools for single-cell analysis in python
import besca as bc #internal BEDA package for single cell analysis
import decoupler as dc #automatic nnotation tool
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy
import anndata as ad
from scipy.sparse import csr_matrix
import scanpy.external as sce
from harmony import harmonize
import umap.umap_ as umap
print(ad.__version__)
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpne5up9ur INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpne5up9ur/_remote_module_non_scriptable.py INFO:lightning_fabric.utilities.seed:Global seed set to 0
0.9.1
Displaying result settings required for single cell analysis
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
scanpy==1.9.3 anndata==0.9.1 umap==0.3.10 numpy==1.22.4 scipy==1.10.1 pandas==1.5.3 scikit-learn==1.2.2 statsmodels==0.14.0 python-igraph==0.10.4 louvain==0.8.0 pynndescent==0.5.10
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/_settings.py:447: DeprecationWarning: `set_matplotlib_formats` is deprecated since IPython 7.23, directly use `matplotlib_inline.backend_inline.set_matplotlib_formats()`
#Reading last saved annoatated data object written in h5ad data format.
#We used similar adata variable to make similar previous data analysis
save_file = '/home/jana/Updated_SCANPY_QC_filtered_BAL_for_Sarcoidosis.h5ad'
adata=sc.read_h5ad(save_file)
Displaying reading data
adata
AnnData object with n_obs × n_vars = 74334 × 14143
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'umap_density_sample', 'umap_density_leiden', 'leiden_1.0', 'leiden_0.8', 'leiden_0.5', 'leiden_0.4'
uns: 'doublet_info_colors', 'leiden', 'leiden_0.4_colors', 'leiden_0.5_colors', 'leiden_0.8_colors', 'leiden_1.0_colors', 'leiden_colors', 'neighbors', 'phase_colors', 'sample_colors', 'umap', 'umap_density_leiden_params', 'umap_density_sample_params'
obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
obsp: 'connectivities', 'distances'
# we need to redefine the mito_genes, hb_genes,
#since they were first calculated on the full object before removing low expressed genes.
print ("Removing some unwanted genes named MALAT1")
malat1 = adata.var_names.str.startswith('MALAT1')
mito_genes = adata.var_names.str.startswith('MT-')
hb_genes = adata.var_names.str.contains('^HB[^(P)]')
rb_gene1 = adata.var_names.str.startswith('RPS')
rb_gene2 = adata.var_names.str.startswith('RPL')
remove = np.add(mito_genes, malat1)
remove = np.add(remove, hb_genes)
remove = np.add(remove, rb_gene1)
remove = np.add(remove, rb_gene2)
keep_updated = np.invert(remove)
adata = adata[:,keep_updated]
print(adata.n_obs, adata.n_vars)
Removing some unwanted genes named MALAT1 74334 14034
Omnipath: A largest available databases of curated prior knowledge. PanglaoDB: A database of cell type markers accessed through Omnipath
# Query Omnipath and get PanglaoDB
markers = dc.get_resource('PanglaoDB')
markers
INFO:root:Downloading data from `https://omnipathdb.org/queries/enzsub?format=json` INFO:root:Downloading data from `https://omnipathdb.org/queries/interactions?format=json` INFO:root:Downloading data from `https://omnipathdb.org/queries/complexes?format=json` INFO:root:Downloading data from `https://omnipathdb.org/queries/annotations?format=json` INFO:root:Downloading data from `https://omnipathdb.org/queries/intercell?format=json` INFO:root:Downloading data from `https://omnipathdb.org/about?format=text` INFO:root:Downloading annotations for all proteins from the following resources: `['PanglaoDB']`
| genesymbol | canonical_marker | cell_type | germ_layer | human | human_sensitivity | human_specificity | mouse | mouse_sensitivity | mouse_specificity | ncbi_tax_id | organ | ubiquitiousness | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | CTRB1 | False | Enterocytes | Endoderm | True | 0.000000 | 0.004394 | True | 0.003311 | 0.020480 | 9606 | GI tract | 0.017 |
| 1 | CTRB1 | True | Acinar cells | Endoderm | True | 1.000000 | 0.000629 | True | 0.957143 | 0.015920 | 9606 | Pancreas | 0.017 |
| 2 | KLK1 | True | Endothelial cells | Mesoderm | True | 0.000000 | 0.008420 | True | 0.000000 | 0.014915 | 9606 | Vasculature | 0.013 |
| 3 | KLK1 | False | Goblet cells | Endoderm | True | 0.588235 | 0.005039 | True | 0.903226 | 0.012408 | 9606 | GI tract | 0.013 |
| 4 | KLK1 | False | Epithelial cells | Mesoderm | True | 0.000000 | 0.008233 | True | 0.225806 | 0.013758 | 9606 | Epithelium | 0.013 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8456 | SLC14A1 | True | Urothelial cells | Mesoderm | True | 0.000000 | 0.018170 | True | 0.000000 | 0.000000 | 9606 | Urinary bladder | 0.008 |
| 8457 | UPK3A | True | Urothelial cells | Mesoderm | True | 0.000000 | 0.000000 | True | 0.000000 | 0.000000 | 9606 | Urinary bladder | 0.000 |
| 8458 | UPK1A | True | Urothelial cells | Mesoderm | True | 0.000000 | 0.000000 | True | 0.000000 | 0.000000 | 9606 | Urinary bladder | 0.000 |
| 8459 | UPK2 | True | Urothelial cells | Mesoderm | True | 0.000000 | 0.000000 | True | 0.000000 | 0.000000 | 9606 | Urinary bladder | 0.000 |
| 8460 | UPK3B | True | Urothelial cells | Mesoderm | True | 0.000000 | 0.000000 | True | 0.000000 | 0.000000 | 9606 | Urinary bladder | 0.000 |
8461 rows × 13 columns
We already sorted for humans only into the current_markers_new.csv
import pandas as pd
new_marker=pd.read_csv('/home/jana/current_markers_new.csv')
data need to be framed to raw
adata.raw = adata
Over Representation Analysis (ORA): it is, employed in Decoupler, infers functional enrichment scores by utilizing expression matrices or differential expression analysis results, constructing contingency tables using set operations, and performing Fisher exact tests, with final scores log-transformed for higher significance.https://decoupler-py.readthedocs.io/en/latest/notebooks/cell_annotation.html
#ORA description
#for more information, please visit
from IPython.display import Image
Image("/home/jana/ora.png")
ORA execution
dc.run_ora(
mat=adata,
net=new_marker,
source='cell_type',
target='genesymbol',
min_n=3,
verbose=True
)
Running ora on mat with 74334 samples and 14034 targets for 118 sources.
100%|████████████████████████████████████| 74334/74334 [04:34<00:00, 271.09it/s]
ORA Estimation: The obtained scores (-log10(p-value))(ora_estimate) and p-values (ora_pvals) are stored in the .obsm key
adata.obsm['ora_estimate']
| source | Acinar cells | Adipocytes | Airway goblet cells | Alpha cells | Alveolar macrophages | Astrocytes | B cells | B cells memory | B cells naive | Basal cells | ... | Smooth muscle cells | T cells | T helper cells | T regulatory cells | Tanycytes | Taste receptor cells | Thymocytes | Trophoblast cells | Tuft cells | Urothelial cells |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AAACCCAAGAAGAGCA-1-0 | 1.071946 | 2.852549 | -0.000000 | 0.522155 | 3.388926 | 0.635484 | 1.699408 | 0.698722 | 1.229883 | -0.0 | ... | 0.441330 | 1.657983 | 0.381946 | 0.488030 | -0.000000 | -0.0 | 0.291758 | -0.0 | 0.971530 | -0.0 |
| AAACCCAAGAGAACCC-1-0 | 0.404824 | 3.530209 | -0.000000 | 0.522155 | 3.388926 | 1.210846 | 3.276940 | 1.169437 | 1.229883 | -0.0 | ... | 0.140245 | 0.660632 | 0.381946 | -0.000000 | 1.940655 | -0.0 | 0.291758 | -0.0 | 2.708921 | -0.0 |
| AAACCCAAGCGCTGAA-1-0 | 1.071946 | 2.231216 | -0.000000 | -0.000000 | 1.940655 | 0.635484 | 1.061286 | 0.336944 | 1.815805 | -0.0 | ... | 0.140245 | 0.660632 | 0.381946 | -0.000000 | -0.000000 | -0.0 | 0.291758 | -0.0 | -0.000000 | -0.0 |
| AAACCCAAGCGGGTAT-1-0 | 0.404824 | 2.231216 | 0.875532 | 1.337943 | 3.388926 | 1.210846 | 4.193602 | 1.169437 | 1.815805 | -0.0 | ... | 0.140245 | 0.660632 | -0.000000 | 1.260799 | 0.786164 | -0.0 | 0.291758 | -0.0 | 1.766035 | -0.0 |
| AAACCCAAGCTGCCTG-1-0 | -0.000000 | 0.190292 | 0.875532 | 1.337943 | 0.786164 | 0.218040 | 2.442353 | 0.336944 | 0.739536 | -0.0 | ... | 0.140245 | 11.778410 | 6.494022 | 0.488030 | -0.000000 | -0.0 | 2.332090 | -0.0 | 0.360958 | -0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| TTTGGTTTCCCAACTC-12 | -0.000000 | 0.049998 | -0.000000 | -0.000000 | 0.786164 | 0.635484 | 1.699408 | 1.734448 | 2.485523 | -0.0 | ... | -0.000000 | 11.778410 | 1.019676 | -0.000000 | 0.786164 | -0.0 | 2.332090 | -0.0 | 1.766035 | -0.0 |
| TTTGGTTTCTGCTGAA-12 | 0.404824 | 2.852549 | -0.000000 | 0.522155 | 3.388926 | 1.210846 | 1.699408 | 1.169437 | 1.815805 | -0.0 | ... | 0.140245 | 1.112777 | 0.381946 | 1.260799 | 1.940655 | -0.0 | 0.291758 | -0.0 | 1.766035 | -0.0 |
| TTTGTTGCACGTCATA-12 | 1.930534 | 0.762435 | -0.000000 | 2.367018 | 3.388926 | 0.635484 | 1.699408 | 0.336944 | 0.739536 | -0.0 | ... | 0.140245 | 1.657983 | 1.019676 | 1.260799 | 0.786164 | -0.0 | 2.332090 | -0.0 | 1.766035 | -0.0 |
| TTTGTTGGTAAGGTCG-12 | 0.404824 | 2.852549 | -0.000000 | 1.337943 | 3.388926 | 0.635484 | 2.442353 | 1.169437 | 1.815805 | -0.0 | ... | 0.441330 | 1.112777 | 0.381946 | -0.000000 | 0.786164 | -0.0 | 0.291758 | -0.0 | 0.971530 | -0.0 |
| TTTGTTGTCAACACCA-12 | 0.404824 | 1.179261 | 0.875532 | -0.000000 | 3.388926 | 0.635484 | 3.276940 | 1.169437 | 1.229883 | -0.0 | ... | -0.000000 | 0.660632 | -0.000000 | -0.000000 | 1.940655 | -0.0 | 0.811016 | -0.0 | 2.708921 | -0.0 |
74334 rows × 118 columns
ORA Visualization
acts = dc.get_acts(adata, obsm_key='ora_estimate')
# We need to remove inf and set them to the maximum value observed for pvals=0
acts_v = acts.X.ravel()
max_e = np.nanmax(acts_v[np.isfinite(acts_v)])
acts.X[~np.isfinite(acts.X)] = max_e
acts
AnnData object with n_obs × n_vars = 74334 × 118
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'umap_density_sample', 'umap_density_leiden', 'leiden_1.0', 'leiden_0.8', 'leiden_0.5', 'leiden_0.4'
uns: 'doublet_info_colors', 'leiden', 'leiden_0.4_colors', 'leiden_0.5_colors', 'leiden_0.8_colors', 'leiden_1.0_colors', 'neighbors', 'phase_colors', 'sample_colors', 'umap', 'umap_density_leiden_params', 'umap_density_sample_params'
obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'
Here NK cells and T cells: examples showing in UMAP plot, however we can do for other immune cells.
sc.pl.umap(acts, color=['NK cells', 'Alveolar macrophages','B cells','T cells','leiden_0.4','leiden_0.5', 'leiden_0.8','leiden_1.0'], cmap='RdBu_r',legend_loc="on data")
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:163: MatplotlibDeprecationWarning: The get_cmap function was deprecated in Matplotlib 3.7 and will be removed two minor releases later. Use ``matplotlib.colormaps[name]`` or ``matplotlib.colormaps.get_cmap(obj)`` instead. /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored /home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:392: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
violin plot: NK cells and T cells
sc.pl.violin(acts, keys=['NK cells', 'Alveolar macrophages','B cells', 'T cells'], groupby='leiden_0.5', rotation= 90)
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
sc.pl.violin(acts, keys=['NK cells', 'Alveolar macrophages','B cells', 'T cells'], groupby='leiden_0.8', rotation= 90)
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
sc.pl.violin(acts, keys=['NK cells', 'Alveolar macrophages','B cells', 'T cells'], groupby='leiden_0.5', rotation= 90)
INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting. INFO:matplotlib.category:Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
sc.pl.violin(acts, keys=['NK cells'], groupby='sample', rotation= 90)
Here 'dfe' respresents: decoupler calcuates ranking genes using wilcoxon method acts represents: here annotate data object
dfe = dc.rank_sources_groups(acts, groupby='leiden_0.5', reference='rest', method='wilcoxon')
dfe
| group | reference | names | statistic | meanchange | pvals | pvals_adj | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | rest | Platelets | 68.750946 | 1.844104 | 0.0 | 0.0 |
| 1 | 0 | rest | Monocytes | 64.362208 | 2.329868 | 0.0 | 0.0 |
| 2 | 0 | rest | Leydig cells | 63.972499 | 0.779552 | 0.0 | 0.0 |
| 3 | 0 | rest | Dendritic cells | 60.773514 | 3.351986 | 0.0 | 0.0 |
| 4 | 0 | rest | Fibroblasts | 60.771825 | 0.781724 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1647 | 9 | rest | Melanocytes | -39.065973 | -0.708444 | 0.0 | 0.0 |
| 1648 | 9 | rest | Neutrophils | -39.433673 | -2.820916 | 0.0 | 0.0 |
| 1649 | 9 | rest | Adipocytes | -40.638093 | -1.832445 | 0.0 | 0.0 |
| 1650 | 9 | rest | Leydig cells | -40.859943 | -1.578088 | 0.0 | 0.0 |
| 1651 | 9 | rest | Kupffer cells | -42.151710 | -3.654825 | 0.0 | 0.0 |
1652 rows × 7 columns
n_ctypes = 3
ctypes_dict = dfe.groupby('group').head(n_ctypes).groupby('group')['names'].apply(lambda x: list(x)).to_dict()
ctypes_dict
{'0': ['Platelets', 'Monocytes', 'Leydig cells'],
'1': ['Proximal tubule cells', 'Leydig cells', 'Alveolar macrophages'],
'10': ['Adipocytes', 'Mesangial cells', 'Neutrophils'],
'11': ['Ductal cells', 'Clara cells', 'Epithelial cells'],
'12': ['NK cells', 'Gamma delta T cells', 'T cells'],
'13': ['Smooth muscle cells', 'Ependymal cells', 'Podocytes'],
'2': ['T cells', 'NK cells', 'Thymocytes'],
'3': ['Monocytes', 'Chondrocytes', 'Microglia'],
'4': ['Osteoclasts', 'Alpha cells', 'Ductal cells'],
'5': ['Smooth muscle cells', 'B cells naive', 'Satellite cells'],
'6': ['Basophils', 'Monocytes', 'Chondrocytes'],
'7': ['Gamma delta T cells', 'Germ cells', 'Schwann cells'],
'8': ['Acinar cells', 'Beta cells', 'Podocytes'],
'9': ['B cells naive', 'Plasmacytoid dendritic cells', 'B cells memory']}
matrixplot based on observation made by ctypes_dict
sc.pl.matrixplot(acts, ctypes_dict, 'leiden_0.5', dendrogram=True, standard_scale='var',
colorbar_title='Z-scaled scores', cmap='RdBu_r')
WARNING: dendrogram data not found (using key=dendrogram_leiden_0.5). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_leiden_0.5']`
Final estimation of decoupler
annotation_dict = dfe.groupby('group').head(1).set_index('group')['names'].to_dict()
annotation_dict
{'0': 'Platelets',
'1': 'Proximal tubule cells',
'10': 'Adipocytes',
'11': 'Ductal cells',
'12': 'NK cells',
'13': 'Smooth muscle cells',
'2': 'T cells',
'3': 'Monocytes',
'4': 'Osteoclasts',
'5': 'Smooth muscle cells',
'6': 'Basophils',
'7': 'Gamma delta T cells',
'8': 'Acinar cells',
'9': 'B cells naive'}
T-test: Finding marker genes
sc.tl.rank_genes_groups(adata, 'leiden_0.5', method='t-test', key_added = "t-test")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key = "t-test")
# results are stored in the adata.uns["t-test"] slot
adata
ranking genes
finished: added to `.uns['t-test']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:43)
AnnData object with n_obs × n_vars = 74334 × 14034
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'umap_density_sample', 'umap_density_leiden', 'leiden_1.0', 'leiden_0.8', 'leiden_0.5', 'leiden_0.4'
uns: 'doublet_info_colors', 'leiden', 'leiden_0.4_colors', 'leiden_0.5_colors', 'leiden_0.8_colors', 'leiden_1.0_colors', 'neighbors', 'phase_colors', 'sample_colors', 'umap', 'umap_density_leiden_params', 'umap_density_sample_params', 'dendrogram_leiden_0.5', 't-test'
obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'
obsp: 'connectivities', 'distances'
T-test overestimated_variance: Finding marker genes
sc.tl.rank_genes_groups(adata, 'leiden_0.5', method='t-test_overestim_var', key_added = "t-test_ov")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key = "t-test_ov")
ranking genes
finished: added to `.uns['t-test_ov']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:43)
Wilcoxon rank-sum: Finding marker genes
The result of a Wilcoxon rank-sum (Mann-Whitney-U) test is very similar to t-test and t-test t-test_overestimate.
sc.tl.rank_genes_groups(adata, 'leiden_0.5', method='wilcoxon', key_added = "wilcoxon")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key="wilcoxon")
ranking genes
finished: added to `.uns['wilcoxon']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:05:03)
adata
AnnData object with n_obs × n_vars = 74334 × 14034
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'umap_density_sample', 'umap_density_leiden', 'leiden_1.0', 'leiden_0.8', 'leiden_0.5', 'leiden_0.4'
uns: 'doublet_info_colors', 'leiden', 'leiden_0.4_colors', 'leiden_0.5_colors', 'leiden_0.8_colors', 'leiden_1.0_colors', 'neighbors', 'phase_colors', 'sample_colors', 'umap', 'umap_density_leiden_params', 'umap_density_sample_params', 'dendrogram_leiden_0.5', 't-test', 't-test_ov', 'wilcoxon'
obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'
obsp: 'connectivities', 'distances'
Here, Cluster 0/Group 0: genes mumbers showing overlap comparing 'Wilcox','T-test',and 'T-test_ov', we can do similar comparing rest of clusters: 1-14.
#compare cluster1 genes, only stores top 100 by default
wc = sc.get.rank_genes_groups_df(adata, group='0', key='wilcoxon', pval_cutoff=0.01, log2fc_min=0)['names']
tt = sc.get.rank_genes_groups_df(adata, group='0', key='t-test', pval_cutoff=0.01, log2fc_min=0)['names']
tt_ov = sc.get.rank_genes_groups_df(adata, group='0', key='t-test_ov', pval_cutoff=0.01, log2fc_min=0)['names']
from matplotlib_venn import venn3
venn3([set(wc),set(tt),set(tt_ov)], ('Wilcox','T-test','T-test_ov') )
plt.show()
Logistic regression: it has been suggested by Natranos et al. (2018)
sc.tl.rank_genes_groups(adata, 'leiden_0.5', method='logreg',key_added = "logreg")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key = "logreg")
ranking genes
finished: added to `.uns['logreg']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
(0:21:19)
/home/jana/my-notebook-venv/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Top five genes using "t-test", "t-test_ov", "wilcoxon", "logreg"
import pandas as pd
# Extracting and displaying the first 5 names from each DataFrame
nfs = ["t-test", "t-test_ov", "wilcoxon", "logreg"]
for nf_name in nfs:
nf = pd.DataFrame(adata.uns[nf_name]["names"]).head(5)
display(f"DataFrame for {nf_name}:")
display(nf)
'DataFrame for t-test:'
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MRC1 | FTL | CD2 | CYBB | CSTB | EEF1A1 | SOD2 | H2AFZ | CTSB | BASP1 | MT2A | SLPI | TRDC | IFI30 |
| 1 | INHBA | FTH1 | IL32 | S100A9 | APOC1 | TPT1 | GBP1 | STMN1 | PSAP | HLA-DPB1 | MT1X | WFDC2 | NKG7 | LYZ |
| 2 | C1QA | FABP4 | CD3D | S100A8 | GPNMB | FAU | CXCL10 | TUBB | MARCKS | LGALS2 | MT1F | SCGB1A1 | GNLY | MT2A |
| 3 | MCEMP1 | C1QB | TRAC | MRC1 | FTL | ZFAS1 | CCL4L2 | TUBA1B | CTSZ | PTMA | MT1G | SCGB3A1 | CCL5 | C1QA |
| 4 | ALDH2 | C1QA | CCL5 | HLA-DRA | LGALS3 | RACK1 | MARCKS | PCLAF | LGMN | FGL2 | MT1E | CAPS | KLRD1 | RNASEK |
'DataFrame for t-test_ov:'
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MRC1 | FABP4 | CD2 | CYBB | CSTB | EEF1A1 | SOD2 | H2AFZ | CTSB | BASP1 | MT2A | SLPI | TRDC | IFI30 |
| 1 | INHBA | FTH1 | IL32 | S100A8 | GPNMB | TPT1 | GBP1 | STMN1 | MARCKS | LGALS2 | MT1X | WFDC2 | NKG7 | MT2A |
| 2 | MCEMP1 | FTL | TRAC | S100A9 | APOC1 | FAU | CXCL10 | PCLAF | LGMN | MEF2C | MT1F | SCGB1A1 | GNLY | MT1X |
| 3 | PNPLA6 | APOC1 | CD3D | MRC1 | CTSL | ZFAS1 | CCL4L2 | TUBB | RNASE1 | FGL2 | MT1G | SCGB3A1 | CCL5 | MT1E |
| 4 | TCF7L2 | C1QB | CCL5 | VCAN | GCHFR | RACK1 | TNFAIP6 | TYMS | FPR3 | PLEKHO1 | MT1E | CAPS | KLRD1 | RNASEK |
'DataFrame for wilcoxon:'
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MRC1 | S100A11 | CD2 | CYBB | CSTB | EEF1A1 | SOD2 | H2AFZ | CTSB | PTMA | MT2A | SLPI | NKG7 | IFI30 |
| 1 | INHBA | LGALS3 | IL32 | S100A8 | APOC1 | FAU | GBP1 | TUBB | PSAP | HLA-DPB1 | MT1X | SCGB3A1 | TRDC | MT2A |
| 2 | TCF7L2 | TSPO | CD3D | S100A9 | GPNMB | TPT1 | MARCKS | STMN1 | MARCKS | BASP1 | MT1F | WFDC2 | GNLY | MT1X |
| 3 | PNPLA6 | APOC1 | LTB | CD44 | CD63 | NACA | CXCL10 | HMGB1 | FPR3 | EEF1B2 | MT1E | SCGB1A1 | CCL5 | RNASEK |
| 4 | LPL | FTH1 | TRAC | MRC1 | FTL | PFDN5 | SNX10 | TUBA1B | NPC2 | RACK1 | MT1G | CAPS | PTMA | ATP6V0C |
'DataFrame for logreg:'
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | HLA-DRB1 | B2M | IL32 | VCAN | GPNMB | TPT1 | CCL4 | H2AFZ | RNASE1 | HLA-DPB1 | MT2A | SCGB1A1 | GNLY | MT2A |
| 1 | NEAT1 | FTH1 | CD2 | CYBB | CCL18 | FAU | CXCL10 | STMN1 | LGMN | CCL17 | MT1X | SCGB3A1 | TRDC | IFI30 |
| 2 | PTPRC | TPT1 | TRAC | OLR1 | CSTB | EEF1A1 | CCL4L2 | TUBB | MAFB | HLA-DPA1 | MT1G | SLPI | NKG7 | MT1X |
| 3 | MARCO | CXCL8 | LTB | CD44 | OTOA | NACA | SOD2 | HIST1H4C | A2M | BASP1 | MT1E | WFDC2 | KLRD1 | MT1G |
| 4 | MRC1 | FTL | CD3G | FCN1 | LGALS1 | HLA-DQA2 | GBP1 | TOP2A | ZFP36L1 | CD74 | MT1F | TFF3 | CTSW | MT1E |
print("Heat Map")
sc.pl.rank_genes_groups_heatmap(adata, n_genes=5, key="wilcoxon", groupby="leiden_0.5", show_gene_labels=True)
print("Dot Map")
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, key="wilcoxon", groupby="leiden_0.5")
print("Stacked Violin")
sc.pl.rank_genes_groups_stacked_violin(adata, n_genes=5, key="wilcoxon", groupby="leiden_0.5")
print("matrixplot")
sc.pl.rank_genes_groups_matrixplot(adata, n_genes=5, key="wilcoxon", groupby="leiden_0.5")
Heat Map
Dot Map
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Stacked Violin
matrixplot
import scanpy as sc
import matplotlib.pyplot as plt
# Assuming 'leiden_0.5' is the column containing cluster information
clusters = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
for cluster_id in clusters:
cl = adata[adata.obs['leiden_0.5'] == cluster_id]
print("Cluster:", cluster_id)
print(cl.obs['type'].value_counts())
sc.tl.rank_genes_groups(cl, 'type', method='wilcoxon')
sc.pl.rank_genes_groups(cl, n_genes=25, sharey=False, method='wilcoxon')
sc.pl.rank_genes_groups_dotplot(cl, n_genes=5, groupby="type")
sc.pl.rank_genes_groups_dotplot(cl, n_genes=5, groupby="sample")
plt.show()
Cluster: 0 Healthy 11945 Sarcoidosis 4540 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:34)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 1 Healthy 7669 Sarcoidosis 4609 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:22)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 2 Healthy 6306 Sarcoidosis 4008 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:18)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 3 Healthy 5372 Sarcoidosis 3322 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:15)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 4 Healthy 5335 Sarcoidosis 2262 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:14)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 5 Healthy 4565 Sarcoidosis 2245 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:11)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 6 Healthy 1689 Sarcoidosis 1667 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:06)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 7 Healthy 1467 Sarcoidosis 1154 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:05)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 8 Sarcoidosis 1446 Healthy 1153 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:04)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 9 Healthy 633 Sarcoidosis 591 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:02)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 10 Healthy 768 Sarcoidosis 236 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:02)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 11 Healthy 345 Sarcoidosis 306 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:01)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Cluster: 12 Sarcoidosis 350 Healthy 300 Name: type, dtype: int64 ranking genes
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:580: ImplicitModificationWarning: Trying to modify attribute `._uns` of view, initializing view as actual.
finished: added to `.uns['rank_genes_groups']`
'names', sorted np.recarray to be indexed by group ids
'scores', sorted np.recarray to be indexed by group ids
'logfoldchanges', sorted np.recarray to be indexed by group ids
'pvals', sorted np.recarray to be indexed by group ids
'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:01)
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 20
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: BAL-Sarc-1, BAL-Sarc-2, BAL-Sarc-3, etc.
var_group_labels: Healthy, Sarcoidosis
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
Azimuth Lung Cell cell Markers Genes https://azimuth.hubmapconsortium.org/references/#Human%20-%20Lung%20v2%20%28HLCA%29
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
# azimiuth_gene_lists
azimiuth_gene_lists = {
"Alveolar macrophages": ['HLA-DPB1', 'HLA-DPA1', 'CTSZ', 'ACP5', 'COTL1', 'FCER1G', 'C1QC', 'LAPTM5', 'CTSS'],
"Macrophages": ['C1QB', 'C1QA', 'HLA-DRB1', 'AIF1', 'LYZ', 'CTSZ', 'CTSL', 'FCER1G', 'C1QC', 'LAPTM5'],
"Classical monocytes": ['LST1', 'IL1B', 'LYZ', 'COTL1', 'S100A9', 'VCAN', 'S100A8', 'S100A12', 'AIF1', 'FCN1'],
"Epithelial": ['ELF3', 'CYB5A', 'KRT8', 'KRT19', 'TACSTD2', 'MUC1', 'S100A14'],
"B cell lineage": ['TSC22D3', 'ISG20', 'CORO1A', 'LTB', 'CD79A', 'CD37'],
"CD8 T cells": ['CD8A', 'CD3E', 'CCL4', 'CD2', 'CXCR4', 'GZMA', 'NKG7', 'IL32', 'CD3D', 'CCL5'],
"CD4 T cells": ['CORO1A', 'KLRB1', 'CD3E', 'LTB', 'CXCR4', 'IL7R', 'TRAC', 'IL32', 'CD2', 'CD3D'],
"DC2": ['ITGB2', 'LAPTM5', 'HLA-DRB1', 'HLA-DPB1', 'HLA-DPA1', 'HLA-DMB', 'HLA-DQB1', 'HLA-DQA1', 'HLA-DMA', 'LST1'],
"DC1": ['HLA-DPA1', 'CPNE3', 'CORO1A', 'CPVL', 'C1orf54', 'WDFY4', 'LSP1', 'HLA-DQB1', 'HLA-DQA1', 'HLA-DMA'],
"Treg": ['RTKN2', 'AC133644.2', 'CD4', 'IL2RA', 'TIGIT', 'CTLA4', 'IKZF2'],
"NK cells": ['GNLY', 'TYROBP', 'NKG7', 'FCER1G', 'GZMB', 'TRDC', 'PRF1', 'SPON2', 'KLRF1']
}
sc.pl.matrixplot(
adata,
azimiuth_gene_lists,
"leiden_0.5",
dendrogram=True,
cmap="Blues",
standard_scale="var",
colorbar_title="column scaled\nexpression",
)
ax1 = sc.pl.heatmap(
adata, azimiuth_gene_lists, groupby="leiden_0.5", cmap="viridis", dendrogram=True
)
ax2 = sc.pl.dotplot(
adata, azimiuth_gene_lists, groupby="leiden_0.5"
)
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different. categories: 0, 1, 2, etc. var_group_labels: Alveolar macrophages, Macrophages, Classical monocytes, etc.
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different. categories: 0, 1, 2, etc. var_group_labels: Alveolar macrophages, Macrophages, Classical monocytes, etc. WARNING: Gene labels are not shown when more than 50 genes are visualized. To show gene labels set `show_gene_labels=True`
/home/jana/my-notebook-venv/lib/python3.8/site-packages/scanpy/plotting/_dotplot.py:749: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap', 'norm' will be ignored
#import scipy io package
from scipy import io
save_file = '/home/jana/Updated_SCANPY_QC_filtered_BAL_for_Sarcoidosis.h5ad'
adata.write_h5ad(save_file)
adata
AnnData object with n_obs × n_vars = 74334 × 14034
obs: 'type', 'sample', 'percent_chrY', 'XIST-counts', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'leiden', 'leiden_res0_20', 'leiden_res0_40', 'leiden_res0_60', 'leiden_res0_80', 'leiden_res1', 'S_score', 'G2M_score', 'phase', 'batch', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'umap_density_sample', 'umap_density_leiden', 'leiden_1.0', 'leiden_0.8', 'leiden_0.5', 'leiden_0.4'
uns: 'doublet_info_colors', 'leiden', 'leiden_0.4_colors', 'leiden_0.5_colors', 'leiden_0.8_colors', 'leiden_1.0_colors', 'neighbors', 'phase_colors', 'sample_colors', 'umap', 'umap_density_leiden_params', 'umap_density_sample_params', 'dendrogram_leiden_0.5', 't-test', 't-test_ov', 'wilcoxon', 'logreg'
obsm: 'X_pca', 'X_pca_harmony', 'X_umap', 'ora_estimate', 'ora_pvals'
obsp: 'connectivities', 'distances'